Clustering with Other Algorithms.Rmd
“Far better an approximate answer to the right question, which is often vague, than an exact answer to the wrong question, which can always be made precise.” — John W. Tukey
This vignette serves as a code repository for clustering algorithms that take distances or similarities as input. To be clear, it is not a referendum on which clustering algorithm is best, because there is no such thing as a best distance, a best clustering algorithm, or a best validation method. Every clustering problem is a domain-specific problem that needs patience, iteration, and domain expertise to yield usable results.
With that out of the way, please feel free to recommend clustering algorithms we may have missed by lodging an issue at https://github.com/bmuchmore/PreciseDist/issues
The data and set-up come from the Cell Cycle Vignette - Experiment 5: Minkowski 100x. See that vignette for more details.
# Attach PreciseDist and load the cell-cycle example data set
# (presumably shipped with PreciseDist -- confirm against the package).
library(PreciseDist)
data(list = "data_cell_cycle")

# Quick structural look at the first five columns/elements only.
str(data_cell_cycle[seq_len(5)])
library(dplyr)

# Feature matrix: every column except the `Cell_cycle` label column.
cell_cycle_data <- as.matrix(
  dplyr::select(data_cell_cycle, -Cell_cycle)
)

# Label matrix: only the `Cell_cycle` column, kept as a one-column matrix.
cell_cycle_labels <- as.matrix(
  dplyr::select(data_cell_cycle, Cell_cycle)
)
# Ten evenly spaced Minkowski parameter values between 0.45 and 0.54.
cell_cycle_minkowski_params <- seq(from = 0.45, to = 0.54, length.out = 10)

# Build the "minkowski" function specification from those parameters.
cell_cycle_minkowski_funcs <- precise_func_fact(
  params = cell_cycle_minkowski_params,
  func = "minkowski"
)
library(future)
library(doFuture)

# Make futures the foreach backend so that the `parallel = TRUE` calls
# further down run on the workers configured here.
registerDoFuture()

# `multiprocess` has been deprecated and then made defunct in the future
# package; `multisession` is the portable replacement (background R
# sessions on every OS). Lower `workers` if the machine has fewer than
# 10 cores available.
plan(multisession, workers = 10)
# Run precise_dist() on the feature matrix using the Minkowski function
# specification built earlier in this file.
cell_cycle_minkowski_dists <- as.matrix(cell_cycle_data) %>%
  precise_dist(
    dist_funcs = cell_cycle_minkowski_funcs,
    suffix = "cell_minkowski_",
    partitions = 10,
    time_series = FALSE,
    parallel = TRUE,
    local_timeout = Inf,
    verbose = TRUE,
    # Placeholder path: replace with a real absolute path, including the
    # file name and extension, before running.
    file = "/absolute_path/to_somewhere/with_full_name/inclusing_the/file_extension.rds"
  )
# Step 1: apply the "laplacian" transform to the computed distances.
cell_cycle_minkowski_transformed <- cell_cycle_minkowski_dists %>%
  precise_transform(
    transform = "laplacian"
  )

# Step 2: combine the transformed results with the "fuse" fusion method.
cell_cycle_minkowski_fused <- precise_fusion(
  cell_cycle_minkowski_transformed,
  verbose = TRUE,
  fusion = "fuse"
)
# Build the graph from the fused result. The numeric settings below are
# passed straight through to precise_graph(); see its help page for their
# exact meaning.
cell_cycle_minkowski_graph <- precise_graph(
  data = cell_cycle_minkowski_fused,
  method = 1,
  # Neighbourhood / layout settings.
  n_neighbors = 50,
  min_dist = 0.001,
  spread = 10,
  bandwidth = 10,
  # Execution settings.
  parallel = TRUE,
  verbose = TRUE
)
Now that we have the graph, we will extract the distance and call precise_transform() to ensure that it is, in fact, in distance format. Please note, though, that some functions require similarities. With those functions, we coax the distance into a similarity using proxy::pr_dist2simil():
library(dbscan)

# HDBSCAN on the precomputed dissimilarities.
# Current dbscan::hdbscan() has no `xdist` argument; a `dist` object is
# passed directly as `x` instead, so the clustering really uses the
# PreciseDist distances rather than the raw data matrix.
hdbscan_fit <- cell_cycle_for_clustering %>%
  as.dist() %>%
  dbscan::hdbscan(
    minPts = 5,
    gen_hdbscan_tree = FALSE,
    gen_simplified_tree = FALSE
  )

# One label per observation, e.g. "Cluster_1". dbscan documents label 0 as
# noise, so "Cluster_0" collects the unassigned points.
# paste0() is vectorised (no purrr needed) and avoids the stray space that
# paste() would insert ("Cluster_ 1").
hdbscan_clusters <- dplyr::tibble(
  Hdbscan_Clusters = paste0("Cluster_", hdbscan_fit[["cluster"]])
)
library(cluster)

# Divisive hierarchical clustering (DIANA) on the precomputed
# dissimilarities. `diss = TRUE` tells diana() the input is already a
# dissimilarity object, so `metric` and `stand` are not used.
diana_fit <- cell_cycle_for_clustering %>%
  as.dist() %>%
  cluster::diana(
    diss = TRUE,
    metric = NULL,
    stand = FALSE,
    stop.at.k = FALSE,
    keep.diss = FALSE,
    keep.data = FALSE,
    trace.lev = 0
  )

# Cut the tree into 3 groups and label each observation, e.g. "Cluster_1".
# paste0() is vectorised (no purrr needed) and avoids the stray space that
# paste() would insert ("Cluster_ 1").
diana_clusters <- dplyr::tibble(
  Diana_Clusters = paste0(
    "Cluster_",
    stats::cutree(diana_fit, k = 3, h = NULL)
  )
)
library(cluster)

# Partitioning Around Medoids (PAM) with k = 3 on the precomputed
# dissimilarities. `cluster.only = TRUE` makes pam() return just the
# vector of cluster assignments instead of a full pam object.
pam_assignments <- cell_cycle_for_clustering %>%
  as.dist() %>%
  cluster::pam(
    k = 3,
    diss = TRUE,
    metric = NULL,
    medoids = NULL,
    stand = FALSE,
    cluster.only = TRUE,
    do.swap = TRUE,
    keep.diss = FALSE,
    keep.data = FALSE,
    pamonce = FALSE,
    trace.lev = 0
  )

# One label per observation, e.g. "Cluster_1".
# paste0() is vectorised (no purrr needed) and avoids the stray space that
# paste() would insert ("Cluster_ 1").
pam_clusters <- dplyr::tibble(
  Pam_Clusters = paste0("Cluster_", pam_assignments)
)
library(apcluster)

# Affinity propagation needs similarities, so the distances are converted
# with proxy::pr_dist2simil() before clustering.
ap_clusters <- cell_cycle_for_clustering %>%
  proxy::pr_dist2simil() %>%
  apcluster::apcluster(
    p = NA,
    q = NA,
    maxits = 1000,
    convits = 100,
    lam = 0.9,
    includeSim = FALSE,
    details = FALSE,
    nonoise = FALSE,
    seed = NA
  )

# The `idx` slot holds, for each observation, the index of its exemplar;
# that index is used as the cluster label, e.g. "Cluster_17".
# paste0() is vectorised (no purrr needed) and avoids the stray space that
# paste() would insert ("Cluster_ 17").
ap_clusters <- dplyr::tibble(
  AP_Clusters = paste0("Cluster_", ap_clusters@idx)
)
library(apcluster)

# Affinity propagation with a requested number of clusters (K = 3).
# As with apcluster(), the input must be a similarity, so the distances
# are converted with proxy::pr_dist2simil() first.
apk_clusters <- cell_cycle_for_clustering %>%
  proxy::pr_dist2simil() %>%
  apcluster::apclusterK(
    K = 3,
    prc = 10,
    bimaxit = 20,
    exact = FALSE,
    maxits = 1000,
    convits = 100,
    lam = 0.9,
    includeSim = FALSE,
    details = FALSE,
    nonoise = FALSE,
    seed = NA,
    verbose = FALSE
  )

# The `idx` slot holds, for each observation, the index of its exemplar;
# that index is used as the cluster label, e.g. "Cluster_17".
# paste0() is vectorised (no purrr needed) and avoids the stray space that
# paste() would insert ("Cluster_ 17").
apk_clusters <- dplyr::tibble(
  APk_Clusters = paste0("Cluster_", apk_clusters@idx)
)